Data¶

Imports¶

In [1]:
import os
import pandas as pd
import numpy as np

Merging data from 3 csvs¶

In [2]:
import pandas as pd

# Load the CSV files
df_ss = pd.read_csv('spreadspoke_scores.csv')
df_teams = pd.read_csv('nfl_teams.csv')
df_stads = pd.read_csv('nfl_stadiums.csv', encoding='ISO-8859-1')

df_teams = df_teams.rename(columns={'team_name':'team_home'})
merged_df = pd.merge(df_teams, df_ss, on='team_home', how='inner')# Print the column names of both DataFrames

df_stads = df_stads.rename(columns={'stadium_name': 'stadium'})
df = pd.merge(merged_df, df_stads, on='stadium', how='inner')

## Schedule date transformed to date type, sorted df by schedule date
df['schedule_date'] = pd.to_datetime(df['schedule_date'], format='%m/%d/%Y', errors='coerce')
df = df.sort_values(by='schedule_date').reset_index(drop=True)

## This value should be numeric but is object type
df['over_under_line'] = pd.to_numeric(df['over_under_line'], errors='coerce')


df.tail()
Out[2]:
team_home team_name_short team_id team_id_pfr team_conference team_division team_conference_pre2002 team_division_pre2002 schedule_date schedule_season ... stadium_weather_station_zipcode stadium_weather_type stadium_capacity stadium_surface stadium_weather_station stadium_weather_station_name stadium_latitude stadium_longitude stadium_azimuthangle stadium_elevation
14067 Indianapolis Colts Colts IND CLT AFC AFC South AFC AFC East 2025-01-05 2024 ... 46225 indoor 63,000 FieldTurf US1INMR0076 INDIANAPOLIS 6.8 NNE, IN US 39.760056 -86.162806 26.6 227.1
14068 New York Jets Jets NYJ NYJ NFC AFC East NFC AFC East 2025-01-05 2024 ... 7073 cold 82,500 FieldTurf USW00014734 NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US 40.813528 -74.074361 345.5 2.1
14069 Baltimore Ravens Ravens BAL RAV AFC AFC North AFC AFC Central 2025-01-05 2024 ... 21230 moderate 71,008 FieldTurf USW00093721 BALTIMORE WASHINGTON INTERNATIONAL AIRPORT, MD US 39.278056 -76.622778 289.5 47.5
14070 Green Bay Packers Packers GB GNB NFC NFC North NFC NFC Central 2025-01-05 2024 ... 54304 cold 80,735 Grass USW00014898 GREEN BAY AUSTIN STRAUBEL INTERNATIONAL AIRPOR... 44.501389 -88.062222 0.0 209.4
14071 Tennessee Titans Titans TEN OTI AFC AFC South AFC AFC Central 2025-01-05 2024 ... 37213 moderate 69,143 Grass USW00013897 NASHVILLE INTERNATIONAL AIRPORT, TN US 36.166389 -86.771389 334.6 182.9

5 rows × 39 columns

Object Data¶

In [3]:
df_cat = df.select_dtypes(include=['object', 'category'])
df_cat = df_cat.fillna('Unknown')
df_cat.drop(columns=['team_name_short', 'team_id', 'team_id_pfr','stadium_address','stadium_weather_station_zipcode','stadium_weather_station_name','stadium_weather_station'], inplace=True)  ### IRRELEVANT
df_cat.dtypes
Out[3]:
team_home                  object
team_conference            object
team_division              object
team_conference_pre2002    object
team_division_pre2002      object
schedule_week              object
team_away                  object
team_favorite_id           object
stadium                    object
weather_detail             object
stadium_location           object
stadium_type               object
stadium_weather_type       object
stadium_capacity           object
stadium_surface            object
dtype: object

Float Data¶

In [4]:
df_num = df.select_dtypes(include=['float64'])
df_num[df_num.columns] = df_num.fillna(df_num.mean())
df_num.head()
Out[4]:
score_home score_away spread_favorite over_under_line weather_temperature weather_wind_mph weather_humidity stadium_open stadium_close stadium_latitude stadium_longitude stadium_azimuthangle stadium_elevation
0 14.0 23.0 -5.372946 42.282329 83.0 6.0 71.0 1946.000000 1986.000000 25.790500 -80.316300 192.485631 8.800000
1 45.0 7.0 -5.372946 42.282329 81.0 7.0 70.0 1978.466085 2002.689877 38.021551 -89.763382 192.485631 187.972944
2 27.0 7.0 -5.372946 42.282329 70.0 7.0 82.0 1978.466085 2002.689877 38.021551 -89.763382 192.485631 187.972944
3 14.0 19.0 -5.372946 42.282329 82.0 11.0 78.0 1946.000000 1986.000000 25.790500 -80.316300 192.485631 8.800000
4 31.0 0.0 -5.372946 42.282329 77.0 6.0 82.0 1978.466085 2002.689877 38.021551 -89.763382 192.485631 187.972944

Boolean Data¶

In [5]:
df_bool = df.select_dtypes(include=[bool]).astype(int)
df_bool
Out[5]:
schedule_playoff stadium_neutral
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
... ... ...
14067 0 0
14068 0 0
14069 0 0
14070 0 0
14071 0 0

14072 rows × 2 columns

Integer Data¶

In [6]:
df_int = df.select_dtypes(include=['int64'])
df_int
Out[6]:
schedule_season
0 1966
1 1966
2 1966
3 1966
4 1966
... ...
14067 2024
14068 2024
14069 2024
14070 2024
14071 2024

14072 rows × 1 columns

cleaned Data¶

In [7]:
nfl1 = pd.concat([df_cat, df_num], axis=1)
nfl2 = pd.concat([ df_int, df_bool],axis=1)
nfl = pd.concat([nfl1,nfl2],axis=1)
nfl['score_home'].isna().sum()
Out[7]:
0

Visualized Data¶

Violin Plot¶

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


df_cat['score_home'] = nfl['score_home']
df_cat['score_away'] = nfl['score_away']

for feature in df_cat.columns:
    if feature != 'score_home' and feature != 'score_away':
        plt.figure(figsize=(12, 12))
        
        ax1 = plt.subplot(2, 1, 1)
        sns.violinplot(data=df_cat, x=feature, y='score_home', ax=ax1, color='red')
        ax1.set_title(f'home score and away score vs {feature}', fontsize=16)
        ax1.set_xlabel('')
        ax1.set_ylabel('home score', fontsize=14)
        ax1.set_xticklabels([])  # Remove x-axis tick labels for score_home plot
        ax1.grid(True)

        ax2 = plt.subplot(2, 1, 2)
        sns.violinplot(data=df_cat, x=feature, y='score_away', ax=ax2, color='blue')
        ax2.set_xlabel(feature, fontsize=14)
        ax2.set_ylabel('away score', fontsize=14)
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(True)
        
        plt.tight_layout()
        plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [9]:
df_num.dtypes
Out[9]:
score_home              float64
score_away              float64
spread_favorite         float64
over_under_line         float64
weather_temperature     float64
weather_wind_mph        float64
weather_humidity        float64
stadium_open            float64
stadium_close           float64
stadium_latitude        float64
stadium_longitude       float64
stadium_azimuthangle    float64
stadium_elevation       float64
dtype: object
In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select columns for the pair plot
columns_to_plot = ['score_home', 'score_away', 'over_under_line', 'spread_favorite', 'weather_temperature', 'weather_wind_mph', 'weather_humidity']

# Create a new DataFrame with only the selected columns
plot_df = nfl[columns_to_plot]

# Create the pair plot
sns.pairplot(plot_df)
plt.suptitle('Pair Plot of Selected Columns', y=1.02)
plt.show()
No description has been provided for this image

Randon Forest Model¶

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import time

# Measure runtime
start_time = time.time()

# Separate the last 271 rows for prediction
nfl_train = nfl.iloc[:-271]  
nfl_predict = nfl.iloc[-271:]  

# Convert categorical columns to numerical using one-hot encoding
nfl_train_encoded = pd.get_dummies(nfl_train)
nfl_predict_encoded = pd.get_dummies(nfl_predict)

# Ensure both training and prediction data have the same columns
nfl_predict_encoded = nfl_predict_encoded.reindex(columns=nfl_train_encoded.columns, fill_value=0)

X_train = nfl_train_encoded.drop(columns=['score_home', 'score_away'])  
y_train_home = nfl_train_encoded['score_home']  
y_train_away = nfl_train_encoded['score_away']  

X_predict = nfl_predict_encoded.drop(columns=['score_home', 'score_away'])  

rf_model_home = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_away = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_home.fit(X_train, y_train_home)
rf_model_away.fit(X_train, y_train_away)
predicted_home_scores = rf_model_home.predict(X_predict)
predicted_away_scores = rf_model_away.predict(X_predict)
nfl_predict['predicted_score_home'] = predicted_home_scores
nfl_predict['predicted_score_away'] = predicted_away_scores

# Calculate Mean Squared Error
mse_home = mean_squared_error(nfl_predict['score_home'], predicted_home_scores)
mse_away = mean_squared_error(nfl_predict['score_away'], predicted_away_scores)

# Measure runtime
end_time = time.time()
runtime = end_time - start_time

print(nfl_predict[['predicted_score_home', 'predicted_score_away']])
print(f"Mean Squared Error (Home Scores): {mse_home}")
print(f"Mean Squared Error (Away Scores): {mse_away}")
print(f"Runtime: {runtime:.2f} seconds")


#######  OUTPUT  #######
#   predicted_score_home  predicted_score_away
#   13801                 17.57                 20.50
#   13802                 19.12                 18.85
#   13803                 19.31                 19.86
#   13804                 23.37                 18.59
#   13805                 21.11                 16.22
#   ...                     ...                   ...
#   14067                 23.76                 19.86
#   14068                 16.41                 21.15
#   14069                 21.40                 21.36
#   14070                 19.96                 18.44
#   14071                 21.88                 18.90
#   
#   
#   
#   [271 rows x 2 columns]
#   Mean Squared Error (Home Scores): 6.841222137206541
#   Mean Squared Error (Away Scores): 2.8940572362235955
#   Runtime: 76.65 seconds
       predicted_score_home  predicted_score_away
13801                 17.57                 20.50
13802                 19.12                 18.85
13803                 19.31                 19.86
13804                 23.37                 18.59
13805                 21.11                 16.22
...                     ...                   ...
14067                 23.76                 19.86
14068                 16.41                 21.15
14069                 21.40                 21.36
14070                 19.96                 18.44
14071                 21.88                 18.90

[271 rows x 2 columns]
Mean Squared Error (Home Scores): 6.841222137206541
Mean Squared Error (Away Scores): 2.8940572362235955
Runtime: 77.36 seconds
/var/folders/xc/yrj64zn13bjfv7vdfm67_kb40000gn/T/ipykernel_10625/4112753878.py:32: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_predict['predicted_score_home'] = predicted_home_scores
/var/folders/xc/yrj64zn13bjfv7vdfm67_kb40000gn/T/ipykernel_10625/4112753878.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_predict['predicted_score_away'] = predicted_away_scores

Predicted Scores¶

In [12]:
original_info = df.tail(271)[['schedule_date', 'team_home', 'team_away']].reset_index(drop=True)

# Combine the original info with predictions
result_df = pd.concat([original_info, nfl_predict[['predicted_score_home', 'predicted_score_away']].reset_index(drop=True)], axis=1)

result_df
Out[12]:
schedule_date team_home team_away predicted_score_home predicted_score_away
0 2024-09-05 Kansas City Chiefs Baltimore Ravens 17.57 20.50
1 2024-09-08 Tampa Bay Buccaneers Washington Commanders 19.12 18.85
2 2024-09-08 Cincinnati Bengals New England Patriots 19.31 19.86
3 2024-09-08 Buffalo Bills Arizona Cardinals 23.37 18.59
4 2024-09-08 Los Angeles Chargers Las Vegas Raiders 21.11 16.22
... ... ... ... ... ...
266 2025-01-05 Indianapolis Colts Jacksonville Jaguars 23.76 19.86
267 2025-01-05 New York Jets Miami Dolphins 16.41 21.15
268 2025-01-05 Baltimore Ravens Cleveland Browns 21.40 21.36
269 2025-01-05 Green Bay Packers Chicago Bears 19.96 18.44
270 2025-01-05 Tennessee Titans Houston Texans 21.88 18.90

271 rows × 5 columns

Plot of Predicted Scores¶

In [13]:
plt.figure(figsize=(15, 8))
plt.scatter(result_df['schedule_date'], result_df['predicted_score_home'], label='Predicted Score Home', color='blue', marker='o', alpha=0.7)
plt.scatter(result_df['schedule_date'], result_df['predicted_score_away'], label='Predicted Score Away', color='green', marker='x', alpha=0.7)

plt.title('Predicted Scores vs. Schedule Date')
plt.xlabel('Schedule Date')
plt.ylabel('Predicted Score')
plt.legend()
plt.grid(True)

plt.show()
No description has been provided for this image

Residuals Distribution¶

In [14]:
errors_home = y_train_home - rf_model_home.predict(X_train)
errors_away = y_train_away - rf_model_away.predict(X_train)

plt.figure(figsize=(10, 6))
sns.histplot(errors_home, color='blue', label='Home Score Errors', kde=True)
sns.histplot(errors_away, color='green', label='Away Score Errors', kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image

Predicted vs Training Data¶

In [15]:
import matplotlib.pyplot as plt

# Assuming you have actual scores in `nfl_train` for training predictions
plt.figure(figsize=(10, 6))
plt.scatter(nfl_train['score_home'], rf_model_home.predict(X_train), alpha=0.5, color='blue', label='Home Scores')
plt.scatter(nfl_train['score_away'], rf_model_away.predict(X_train), alpha=0.5, color='green', label='Away Scores')
plt.plot([nfl_train['score_home'].min(), nfl_train['score_home'].max()], [nfl_train['score_home'].min(), nfl_train['score_home'].max()], 'r--', label='Ideal Fit')
plt.xlabel('Actual Scores')
plt.ylabel('Predicted Scores')
plt.title('Predicted vs Actual Scores From Training Data')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image

Distribution of Scores¶

In [16]:
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.histplot(nfl_train['score_home'], color='blue', label='Actual Home Scores', kde=True)
sns.histplot(rf_model_home.predict(X_train), color='cyan', label='Predicted Home Scores', kde=True)
plt.xlabel('Scores')
plt.ylabel('Frequency')
plt.title('Distribution of Actual vs Predicted Home Scores')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [17]:
nfl_predict
Out[17]:
team_home team_conference team_division team_conference_pre2002 team_division_pre2002 schedule_week team_away team_favorite_id stadium weather_detail ... stadium_close stadium_latitude stadium_longitude stadium_azimuthangle stadium_elevation schedule_season schedule_playoff stadium_neutral predicted_score_home predicted_score_away
13801 Kansas City Chiefs AFC AFC West AFC AFC West 1 Baltimore Ravens Unknown GEHA Field at Arrowhead Stadium Unknown ... 2002.689877 39.048889 -94.483889 316.300000 264.900000 2024 0 0 17.57 20.50
13802 Tampa Bay Buccaneers NFC NFC South NFC NFC Central 1 Washington Commanders Unknown Raymond James Stadium Unknown ... 2016.000000 27.975833 -82.503333 0.000000 5.800000 2024 0 0 19.12 18.85
13803 Cincinnati Bengals AFC AFC North AFC AFC Central 1 New England Patriots Unknown Paycor Stadium Unknown ... 2002.689877 39.100000 -84.516670 192.485631 193.900000 2024 0 0 19.31 19.86
13804 Buffalo Bills AFC AFC East AFC AFC East 1 Arizona Cardinals Unknown Highmark Stadium Unknown ... 2002.689877 42.889000 -78.890100 192.485631 178.000000 2024 0 0 23.37 18.59
13805 Los Angeles Chargers AFC AFC West AFC AFC West 1 Las Vegas Raiders Unknown SoFi Stadium indoor ... 2002.689877 33.953450 -118.339200 338.200000 187.972944 2024 0 0 21.11 16.22
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14067 Indianapolis Colts AFC AFC South AFC AFC East 18 Jacksonville Jaguars Unknown Lucas Oil Stadium indoor ... 2002.689877 39.760056 -86.162806 26.600000 227.100000 2024 0 0 23.76 19.86
14068 New York Jets NFC AFC East NFC AFC East 18 Miami Dolphins Unknown MetLife Stadium Unknown ... 2002.689877 40.813528 -74.074361 345.500000 2.100000 2024 0 0 16.41 21.15
14069 Baltimore Ravens AFC AFC North AFC AFC Central 18 Cleveland Browns Unknown M&T Bank Stadium Unknown ... 2002.689877 39.278056 -76.622778 289.500000 47.500000 2024 0 0 21.40 21.36
14070 Green Bay Packers NFC NFC North NFC NFC Central 18 Chicago Bears Unknown Lambeau Field Unknown ... 2002.689877 44.501389 -88.062222 0.000000 209.400000 2024 0 0 19.96 18.44
14071 Tennessee Titans AFC AFC South AFC AFC Central 18 Houston Texans Unknown Nissan Stadium Unknown ... 2016.000000 36.166389 -86.771389 334.600000 182.900000 2024 0 0 21.88 18.90

271 rows × 33 columns

In [ ]: